<?php
/**
 * Created by JetBrains PhpStorm.
 * User: zhouyang
 * Date: 13-7-15
 * Time: 4:31 PM
 * To change this template use File | Settings | File Templates.
 */

/**
 * Controller that crawls articles from www.36kr.com and imports them as news.
 *
 * Depends on names defined outside this file: the Action base class, Template,
 * Response, file_get_html() (its usage here matches the simple_html_dom API —
 * TODO confirm), rep_img_src() and imagezoom().
 */
class CrawlAction extends Action{
    private $A_new;
    private $A_category;

    private $A_source;

    /**
     * Constructor: resolves the action instances this controller works with.
     *
     * NOTE(review): the parent constructor is not invoked here — confirm that
     * Action does not require it.
     */
    public function __construct(){
        $this->A_new = Action::getInstance('new');
        $this->A_category = Action::getInstance('category');

        $this->A_source = Action::getInstance('source');
    }

    /**
     * Renders the crawl page, passing it the category list.
     */
    public function index(){
        $data = array(
            'category_list' => $this->A_category->getCategorys()
        );

        Template::show('Crawl/index.php',$data);
    }

    /**
     * Echoes the available crawl sources as JSON rows of [source_id, title].
     *
     * Request parameter "mode" (default 1): 1 lists the sources returned by the
     * source action, 2 returns the hard-coded 36kr entry. Any other mode, or an
     * empty source list, yields the single placeholder row.
     */
    public function ajaxGetSource(){
        $mode = $this->input()->get('mode',1);

        // Start from an empty list so an unexpected mode falls through to the
        // placeholder row instead of reading an undefined variable.
        $ret = array();
        if($mode == 1){
            $ret = $this->A_source->getSources();
        }else if($mode == 2){
            $ret = array(0=>array('source_id'=>4,'title'=>'36kr','site_address'=>'http://www.36kr.com'));
        }

        $data = array();
        if($ret){
            foreach((array)$ret as $k => $v){
                $data[$k] = array($v['source_id'], $v['title']);
            }
        }else{
            $data = array(array('all','请选择抓取来源'));
        }
        echo Response::Default_JSON(0,$data);
    }

    /**
     * Crawls one 36kr article and echoes its extracted fields as JSON.
     *
     * Request parameter "p_id" is appended to http://www.36kr.com/p/.
     * The response carries keywords, description, summary, title, content,
     * picture (list of image URLs found in the content), tag (comma separated)
     * and curr_category.
     */
    public function spiderSigle(){
        set_time_limit(0);
        $p_id = $this->crawlQueryParam('p_id');

        $html = $this->crawlFetchHtml('http://www.36kr.com/p/'.$p_id);
        if(!$html){
            return;
        }

        $description = $this->crawlNodeProp($html->find('meta[name=description]', 0), 'content');
        $content = $this->crawlNodeProp($html->find('div.mainContent',0), 'innertext');

        $info = array();
        $info['keywords'] = $this->crawlNodeProp($html->find('meta[name=keywords]', 0), 'content');
        $info['description'] = $description;
        $info['summary'] = $description;
        $info['title'] = $this->crawlNodeProp($html->find('h1.entry-title', 0), 'plaintext');
        $info['content'] = $content;
        $info['picture'] = $this->crawlImageSources($content);
        $info['tag'] = implode(',', $this->crawlTagTexts($html));
        $info['curr_category'] = $this->crawlNodeProp($html->find('span[itemprop=title]', 0), 'plaintext');

        $html->clear();

        echo Response::Default_JSON(200,$info);
    }

    /**
     * Crawls one listing page of a 36kr category and echoes its posts as JSON.
     *
     * Request parameters: "type" (category slug; nothing is output when it is
     * missing or empty) and "page". Each row carries href, src, title, type,
     * time and summary; rows are emitted in reverse page order.
     */
    public function spiderBatch(){
        set_time_limit(0);
        $type = $this->crawlQueryParam('type');
        $page = $this->crawlQueryParam('page');
        if(!isset($type) || $type == null){
            return;
        }

        $html = $this->crawlFetchHtml('http://www.36kr.com/category/'.$type.'?page='.$page);
        if(!$html){
            return;
        }

        $arr = array();
        foreach($html->find('div.blogPost') as $k => $post){
            $published = $this->crawlNodeProp($post->find('div.right_info div.post_meta abbr',0), 'title');

            $info = array();
            $info['href'] = 'http://www.36kr.com'.$this->crawlNodeProp($post->find('div.left_info div.feature_img a',0), 'href');
            $info['src'] = $this->crawlNodeProp($post->find('div.left_info div.feature_img a img',0), 'src');
            $info['title'] = $this->crawlNodeProp($post->find('div.right_info div.summary h4 a',0), 'plaintext');
            $info['type'] = $this->crawlNodeProp($post->find('div.right_info div.post_meta a',1), 'plaintext');
            $info['time'] = date('Y-m-d H:i:s', strtotime((string)$published));
            $info['summary'] = $this->crawlNodeProp($post->find('div.right_info div.excerpt p',0), 'plaintext');

            $arr[$k] = $info;
        }

        $html->clear();

        echo Response::Default_JSON(200,array_reverse($arr));
    }

    /**
     * Crawls one 36kr article and stores it as a news record, then exits.
     *
     * Request parameters: "id" (article id), "type" (category name), "title",
     * "time" (publication time, parsed with strtotime) and optional "summary".
     * Echoes {"status":2,"id":...} when the title lookup already returns a
     * record, otherwise {"status":1,"id":<result of add()>}.
     */
    public function ajaxAddNew(){
        set_time_limit(0);

        $p_id = $this->crawlQueryParam('id');
        $type = $this->crawlQueryParam('type');
        $title = $this->crawlQueryParam('title');
        $time = $this->crawlQueryParam('time');
        $summary = $this->crawlQueryParam('summary');

        $check_title = $this->A_new->get($title);

        if($check_title != null){
            echo json_encode(array("status" => 2, "id" => $check_title));
            exit;
        }

        $published = strtotime((string)$time);

        $info = array();
        // The title must be part of the record: it is checked below and was
        // previously never assigned.
        $info['title'] = $title;
        $info['published'] = $published;
        $info['cid'] = $this->checkCategory($type);

        // Fetch the article page.
        $html = $this->crawlFetchHtml('http://www.36kr.com/p/'.$p_id);
        if(!$html){
            exit;
        }

        $info['keywords'] = $this->crawlNodeProp($html->find('meta[name=keywords]', 0), 'content');
        $info['description'] = $this->crawlNodeProp($html->find('meta[name=description]', 0), 'content');
        $info['summary'] = !empty($summary) ? $summary : $info['description'];

        $content = '';
        foreach ($html->find('div.mainContent p') as $element){
            $content .= $element->outertext;
        }
        $info['content'] = rep_img_src($content,$time);

        // The first image of the content becomes the thumbnail source.
        $images = $this->crawlImageSources($info['content']);
        $first_image = isset($images[0]) ? $images[0] : '';

        $dir = 'Uploads/image/'.date('Ymd',$published).'/';
        if (!file_exists($dir.'thumb/')) {
            mkdir($dir.'thumb/',0777,true);
        }

        // Image URLs may carry a "!suffix" after the extension; strip it from
        // both the source file name and the generated thumbnail name.
        $picName = pathinfo($first_image);
        $base_parts = explode('!', isset($picName['basename']) ? $picName['basename'] : '');
        $source_name = $base_parts[0];
        $extension = isset($picName['extension']) ? $picName['extension'] : '';
        $name_parts = explode('!', date('YmdHis',$published) . '_' . rand(1,999999999) . '.' . $extension);
        $file_name = $name_parts[0];

        $result_status = imagezoom( $dir.$source_name, $dir.'thumb/'.$file_name, 320, 200, '#FFFFFF');

        if($result_status != 1){
            $this->error("抓取新闻生成图片失败，请联系管理员检查！#".$result_status);
        }
        $info['picture'] = $dir.'thumb/'.$file_name;

        $tags = $this->crawlTagTexts($html);
        $tag_id_arr = array();
        foreach($tags as $tag_text){
            $tag_id_arr[] = $this->checkTag($tag_text,$info['cid']);
        }
        $info['tag'] = implode(',', $tags);

        $info['aid'] = $_SESSION['my_info']['aid'];

        // Articles about 36kr itself are stored with status 0, others with 1.
        // strpos() returns 0 for a match at the very start of the string, so
        // the result has to be compared against false explicitly.
        $info['status'] = 1;
        foreach(array((string)$info['title'], (string)$info['content']) as $text){
            if(strpos($text,'36kr') !== false || strpos($text,'氪') !== false){
                $info['status'] = 0;
                break;
            }
        }

        // Store through the "new" action instance held by this controller; the
        // previous code called add() on an undefined $news variable.
        $result = $this->A_new->add($info);

        if($result){
            foreach($tag_id_arr as $v){
                $this->checkTagNew($v,$result);
            }
        }

        echo json_encode(array("status" => 1, "id" => $result));

        $html->clear();

        exit;
    }

    /**
     * Returns the named $_GET value, or null when the parameter is absent.
     */
    private function crawlQueryParam($key){
        return isset($_GET[$key]) ? $_GET[$key] : null;
    }

    /**
     * Loads and parses a remote page with file_get_html().
     *
     * Returns the parsed document, or false after reporting the failure
     * through $this->error() when the page could not be loaded.
     */
    private function crawlFetchHtml($url){
        $html = file_get_html($url);
        if(!$html){
            $this->error("抓取页面失败，请稍后重试！#".$url);
            return false;
        }
        return $html;
    }

    /**
     * Reads a property of a DOM node, yielding null when the node is missing.
     */
    private function crawlNodeProp($node, $name){
        return $node ? $node->$name : null;
    }

    /**
     * Returns the src values of all <img> tags in $markup whose URL contains a
     * gif/jpg/png/bmp/jpeg extension (case-insensitive). Anything following
     * the extension inside the attribute, such as a "!suffix", is kept.
     */
    private function crawlImageSources($markup){
        $pattern = '/<img\b[^>]*?\bsrc=[\'"]([^\'"]+?\.(?:gif|jpg|png|bmp|jpeg)[^\'"]*)[\'"][^>]*>/i';
        if(!preg_match_all($pattern, (string)$markup, $matches)){
            return array();
        }
        return $matches[1];
    }

    /**
     * Returns the plain text of every a.tag element in the document.
     */
    private function crawlTagTexts($html){
        $tags = array();
        foreach ($html->find('a.tag') as $element){
            $tags[] = $element->plaintext;
        }
        return $tags;
    }
}